<!DOCTYPE html>
<html lang="zh-CN">
<head><meta name="generator" content="Hexo 3.9.0">
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  
  <title>预训练语言模型-神经网络语言模型：LSTMLM | Rogerspy&#39;s Home</title>
  
  <meta name="keywords" content="Machine Learning, Deep Learning, NLP">
  
  

  
  <link rel="alternate" href="/atom.xml" title="Rogerspy's Home">
  

  <meta name="HandheldFriendly" content="True">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
  <!-- meta -->
  
  
  <meta name="theme-color" content="#FFFFFF">
  <meta name="msapplication-TileColor" content="#1BC3FB">
  <meta name="msapplication-config" content="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/browserconfig.xml">
  

  <!-- link -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css">
  
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/node-waves@0.7.6/dist/waves.min.css">
  
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.10.1/css/all.min.css">
  
  
  <link rel="shortcut icon" type="image/x-icon" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicon.ico">
  <link rel="icon" type="image/x-icon" sizes="32x32" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/favicon-32x32.png">
  <link rel="apple-touch-icon" type="image/png" sizes="180x180" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/apple-touch-icon.png">
  <link rel="mask-icon" color="#1BC3FB" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/safari-pinned-tab.svg">
  <link rel="manifest" href="https://cdn.jsdelivr.net/gh/xaoxuu/assets@master/favicon/favicons/site.webmanifest">
  

  

  
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/css/style.css">
  

  <script>
    /**
     * Set the width of the page's top loading bar.
     *
     * @param {number|string} num Progress value; it is concatenated with "%"
     *   and assigned to the CSS width of the element with id "loading-bar".
     */
    function setLoadingBarProgress(num) {
      var bar = document.getElementById('loading-bar');
      // Skip quietly when the bar element is not in the DOM instead of
      // throwing a TypeError that would abort the calling inline script.
      if (!bar) {
        return;
      }
      bar.style.width = num + "%";
    }
  </script>
  

  
  
  <!-- 时间线 -->
  <link rel="stylesheet" href="/css/timeline.css">
  <!-- 血小板-->
  <link rel="stylesheet" href="/live2d/css/live2d.css">
  <style>
	/* Math rendered inside article paragraphs: monospace-first font stack,
	   no background, light padding and rounded corners.
	   NOTE(review): `.mjx-math` is presumably the class emitted by MathJax's
	   CommonHTML renderer; confirm against the MathJax build loaded by this page. */
	.article p .mjx-math {
	    font-family: Menlo,Monaco,courier,monospace,"Lucida Console",'Source Code Pro',"Microsoft YaHei",Helvetica,Arial,sans-serif,Ubuntu;
        background: none;
        padding: 2px;
        border-radius: 4px;
	}
  </style>
</head>

<body>
  
  
  <header class="l_header pure">
  <div id="loading-bar-wrapper">
    <div id="loading-bar" class="pure"></div>
  </div>

	<div class='wrapper'>
		<div class="nav-main container container--flex">
      <a class="logo flat-box" href='/' >
        
          Rogerspy's Home
        
      </a>
			<div class='menu navgation'>
				<ul class='h-list'>
          
  					
  						<li>
								<a class="nav flat-box" href="/blog/"
                  
                  
                  id="blog">
									<i class='fas fa-edit fa-fw'></i>&nbsp;博客
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/video/"
                  
                  
                  id="video">
									<i class='fas fa-film fa-fw'></i>&nbsp;视频小站
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/material/"
                  
                  
                  id="material">
									<i class='fas fa-briefcase fa-fw'></i>&nbsp;学习资料
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/diary/"
                  
                  
                  id="diary">
									<i class='fas fa-book fa-fw'></i>&nbsp;随心记
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/categories/"
                  
                    rel="nofollow"
                  
                  
                  id="categories">
									<i class='fas fa-folder-open fa-fw'></i>&nbsp;分类
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/tags/"
                  
                    rel="nofollow"
                  
                  
                  id="tags">
									<i class='fas fa-hashtag fa-fw'></i>&nbsp;标签
								</a>
							</li>
      			
  						<li>
								<a class="nav flat-box" href="/blog/archives/"
                  
                    rel="nofollow"
                  
                  
                  id="blogarchives">
									<i class='fas fa-archive fa-fw'></i>&nbsp;归档
								</a>
							</li>
      			
      		
				</ul>
			</div>

			
				<div class="m_search">
					<form name="searchform" class="form u-search-form">
						<input type="text" class="input u-search-input" placeholder="搜索" />
						<i class="icon fas fa-search fa-fw"></i>
					</form>
				</div>
			
			<ul class='switcher h-list'>
				
					<li class='s-search'><a class="fas fa-search fa-fw" href='javascript:void(0)'></a></li>
				
				<li class='s-menu'><a class="fas fa-bars fa-fw" href='javascript:void(0)'></a></li>
			</ul>
		</div>

		<div class='nav-sub container container--flex'>
			<a class="logo flat-box"></a>
			<ul class='switcher h-list'>
				<li class='s-comment'><a class="flat-btn fas fa-comments fa-fw" href='javascript:void(0)'></a></li>
        
          <li class='s-toc'><a class="flat-btn fas fa-list fa-fw" href='javascript:void(0)'></a></li>
        
			</ul>
		</div>
	</div>
</header>
	<aside class="menu-phone">
    <header>
		<nav class="menu navgation">
      <ul>
        
          
            <li>
							<a class="nav flat-box" href="/"
                
                
                id="home">
								<i class='fas fa-clock fa-fw'></i>&nbsp;近期文章
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/blog/archives/"
                
                  rel="nofollow"
                
                
                id="blogarchives">
								<i class='fas fa-archive fa-fw'></i>&nbsp;文章归档
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/blog/"
                
                
                id="blog">
								<i class='fas fa-edit fa-fw'></i>&nbsp;我的博客
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/video/"
                
                  rel="nofollow"
                
                
                id="video">
								<i class='fas fa-film fa-fw'></i>&nbsp;我的视频
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/material/"
                
                  rel="nofollow"
                
                
                id="material">
								<i class='fas fa-briefcase fa-fw'></i>&nbsp;学习资料
							</a>
            </li>
          
            <li>
							<a class="nav flat-box" href="/about/"
                
                  rel="nofollow"
                
                
                id="about">
								<i class='fas fa-info-circle fa-fw'></i>&nbsp;关于小站
							</a>
            </li>
          
       
      </ul>
		</nav>
    </header>
	</aside>
<script>/* Header and mobile menu markup above has been parsed: move the loading bar to 40%. */setLoadingBarProgress(40);</script>



  <div class="l_body nocover">
    <div class='body-wrapper'>
      <div class='l_main'>
  

  
    <article id="post" class="post white-box article-type-post" itemscope itemprop="blogPost">
      


  <section class='meta'>
    
    
    <div class="meta" id="header-meta">
      
        
  
    <h1 class="title">
      <a href="/2021/03/31/neural-language-model-lstm/">
        预训练语言模型-神经网络语言模型：LSTMLM
      </a>
    </h1>
  


      
      <div class='new-meta-box'>
        
          
        
          
            
  <div class='new-meta-item author'>
    <a href="https://rogerspy.gitee.io" rel="nofollow">
      
        <i class="fas fa-user" aria-hidden="true"></i>
      
      <p>Rogerspy</p>
    </a>
  </div>


          
        
          
            <div class="new-meta-item date">
  <a class='notlink'>
    <i class="fas fa-calendar-alt" aria-hidden="true"></i>
    <p>2021-03-31</p>
  </a>
</div>

          
        
          
            
  
  <div class='new-meta-item category'>
    <a href='/categories/语言模型/' rel="nofollow">
      <i class="fas fa-folder-open" aria-hidden="true"></i>
      <p>语言模型</p>
    </a>
  </div>


          
        
          
            
  
    <div class="new-meta-item browse busuanzi">
      <a class='notlink'>
        <i class="fas fa-eye" aria-hidden="true"></i>
        <p>
          <span id="busuanzi_value_page_pv">
            <i class="fas fa-spinner fa-spin fa-fw" aria-hidden="true"></i>
          </span>
        </p>
      </a>
    </div>
  


          
        
          
            

          
        
          
            
  
    <div style="margin-right: 10px;">
      <span class="post-time">
        <span class="post-meta-item-icon">
          <i class="fa fa-keyboard"></i>
          <span class="post-meta-item-text">  字数统计: </span>
          <span class="post-count">3.5k字</span>
        </span>
      </span>
      &nbsp; | &nbsp;
      <span class="post-time">
        <span class="post-meta-item-icon">
          <i class="fa fa-hourglass-half"></i>
          <span class="post-meta-item-text">  阅读时长≈</span>
          <span class="post-count">13分</span>
        </span>
      </span>
    </div>
  

          
        
      </div>
      
        <hr>
      
    </div>
  </section>


      <section class="article typo">
        <div class="article-entry" itemprop="articleBody">
          <h1 id="1-简介"><a href="#1-简介" class="headerlink" title="1. 简介"></a>1. 简介</h1><p><em>Mikolov</em> 等人提出的 <em>RNN</em> 语言模型解决了前馈神经网络语言模型的语序问题。但是由于 <em>RNN</em> 神经网络本身存在着长程依赖问题，导致 <em>RNN</em> 语言模型很难学到距离较远的信息。</p>
<a id="more"></a>
<p>比如：“我的家乡是广东，广东有很多好吃的，我最喜欢的是海鲜，我们的方言是粤语…” 假设有这样一个句子，我们想通过前文去预测 “粤语” 这个词，显然它是和 “广东” 相关联的信息。但是我们会发现 “广东” 在句子中距离 “粤语” 很远。<em>RNN</em> 很难学到这样远距离的信息，关于为什么会出现这样的情况可以参考 <a href="http://people.idsia.ch/~juergen/SeppHochreiter1991ThesisAdvisorSchmidhuber.pdf" target="_blank" rel="noopener">Hochreiter (1991，德语)</a> 和 <a href="http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf" target="_blank" rel="noopener">Bengio, et al. (1994)</a> 两篇文章，简单来说就是因为 <em>RNN</em> 循环过程中时间步之间是连乘的关系，一旦出现较大或者较小的值，经过连乘就会发生梯度爆炸或者梯度消失的情况。出现这种情况以后模型就学不到什么东西了。</p>
<p>为了解决这个问题，<a href="http://www.bioinf.jku.at/publications/older/2604.pdf" target="_blank" rel="noopener">Hochreiter &amp; Schmidhuber (1997)</a> 提出了长短期记忆网络（<em>Long Short Term Memory networks</em>，即所谓的 <em>LSTM</em> 网络）。使用 <em>LSTM</em> 来构建语言模型可以避免由 <em>RNN</em> 带来的长程依赖问题。</p>
<h1 id="2-LSTM-语言模型"><a href="#2-LSTM-语言模型" class="headerlink" title="2. LSTM 语言模型"></a>2. LSTM 语言模型</h1><h2 id="2-1-LSTM-神经网络简介"><a href="#2-1-LSTM-神经网络简介" class="headerlink" title="2.1 LSTM 神经网络简介"></a>2.1 LSTM 神经网络简介</h2><p>长短期记忆网络（Long Short Term Memory networks），通常简称为“LSTM”，是一种特殊的RNN，它能够规避掉长期依赖学习问题。它是由 <a href="http://www.bioinf.jku.at/publications/older/2604.pdf" target="_blank" rel="noopener">Hochreiter &amp; Schmidhuber (1997)</a> 提出的，并且经过很多人的改进。</p>
<p>LSTM被设计出来用以解决长期依赖问题。<strong>长时间记住信息实际上是他们的默认行为，而不是他们努力学习的东西！</strong>(<em>Remembering information for long periods of time is practically their default behavior, not something they struggle to learn!</em>)</p>
<p>所有的循环神经网络都有着重复模块的链式神经网络结构。标准的RNN的重复模块有非常简单的结构，比如单个 <em>tanh</em> 层。LSTM 也有这种类似的链式结构，但是重复模块却有着不同的结构，不同于单一网络层结构，LSTM 的基本结构如下图。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/lSTM.png" height="170" width="450"></p>
<p>对比 RNN 的结构，我们会发现，LSTM 要复杂得多。简单来说，LSTM 通过三个门控来调节当前神经元中学习到的信息，避免梯度消失或者爆炸。</p>
<blockquote>
<p>图中 $\sigma$ 表示 <em>sigmoid</em> 函数：</p>
<script type="math/tex; mode=display">
\sigma(z) = \frac{1}{1+\exp(-z)} \in (0, 1)</script><p>$\tanh$ 表示 <em>tanh</em> 函数：</p>
<script type="math/tex; mode=display">
\tanh(x) = \frac{\exp(x)-\exp(-x)}{\exp(x)+\exp(-x)} \in (-1, 1)</script></blockquote>
<ul>
<li><p><em>cell</em> 状态：LSTM 最重要的就是 <em>cell</em> 状态 $\vec{C}_t$，表示当前时间神经网络学习到的信息；</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/cellstate.png" width="450" height="170"></p>
</li>
<li><p>遗忘门：控制上一个 <em>cell</em> 中有多少信息会进入到当前的 <em>cell</em>；</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/forget.png" width="450" height="170"></p>
</li>
<li><p>输入门：控制输入层有多少信息会进入到当前 <em>cell</em> 中；</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/input.png" width="450" height="170"></p>
</li>
<li><p>输出门：控制当前 <em>cell</em> 有多少信息可以用于输出。</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/output.png" width="450" height="170"></p>
</li>
</ul>
<h2 id="2-2-LSTM-语言模型"><a href="#2-2-LSTM-语言模型" class="headerlink" title="2.2 LSTM 语言模型"></a>2.2 LSTM 语言模型</h2><p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/20210408163912.png" alt></p>
<p>从之前的神经网络语言模型中，我们会发现一些规律：</p>
<ul>
<li>输入词是通过 <em>1-of-K</em> 编码的，其中 $K$ 是词表大小；</li>
<li>输出层通过 <em>softmax</em> 得到一个归一化的概率分布；</li>
<li>训练过程使用交叉熵损失函数，等价于最大似然估计。</li>
</ul>
<p><em>Sundermeyer</em> 等人也使用了相同的方法，用来构建 LSTM 语言模型。首先将输入层的词经过一个投影层，转化成词嵌入（实际上就是 <em>Embedding</em> 过程），然后传递给 LSTM，最后经过 <em>softmax</em> 进行输出。</p>
<p>对于大规模的语言模型训练来说，<em>softmax</em> 层的计算消耗了大量的时间：</p>
<script type="math/tex; mode=display">
a_i = \sum_{j=1}^J \omega_{ij} b_j</script><p>其中 $J$ 表示 LSTM 隐层节点数，$\omega_{ij}$ 表示 LSTM 层与输出层的权重，$i=1,…,V$ ，其中 $V$ 表示词表大小。</p>
<p>为了降低计算时间，<em>Morin &amp;  Bengio</em> 、<em>Goodman</em> 提出将词进行分类，然后预测下一个词所在的类别，然后再预测具体的词：</p>
<script type="math/tex; mode=display">
p(w_m|w_{1:m-1}) = p(w_m|c(w_m),w_{1:m-1})p(c(w_m)|w_{1:m-1})</script><p>其中 $w_m \in c(w_m)$，$c(w_m)$ 表示 $w_m$  所在的类别。</p>
<h2 id="2-3-AWD-LSTM-语言模型"><a href="#2-3-AWD-LSTM-语言模型" class="headerlink" title="2.3 AWD-LSTM 语言模型"></a>2.3 AWD-LSTM 语言模型</h2><p>LSTM 作为 RNN 最优秀的变种之一，在进行语言建模的时候也有着相当优秀的表现。但是 作为神经网络，LSTM 也存在着泛化性问题。通常为了提高神经网络的泛化性，人们提出了各种各样的正则化策略。</p>
<p>AWD-LSTM 提出了一些正则化和优化策略，这些策略不仅高效，而且可以在不改变 LSTM 结构的条件下实现。它在语言建模上的优异表现使得它一度成为最优秀的语言模型。下面我们就介绍一下这个模型。</p>
<blockquote>
<p>LSTM 数学公式：</p>
<script type="math/tex; mode=display">
f_t=\sigma(W_f\cdot [h_{t-1}, x_t]+b_f) \\\\
i_t = \sigma(W_i\cdot[h_{t-1}, x_t]+b_i) \\\\
\widetilde{C}_t = \tanh(W_C\cdot [h_{t-1}, x_t]+b_C) \\\\
o_t = \sigma(W_o \cdot [h_{t-1}, x_t] +b_o) \\\\
C_t = i_t * \widetilde{C}_t + f_t * C_{t-1} \\\\
h_t = o_t * \tanh(C_t)</script></blockquote>
<h3 id="2-3-1-weight-dropped-LSTM"><a href="#2-3-1-weight-dropped-LSTM" class="headerlink" title="2.3.1 weight-dropped LSTM"></a>2.3.1 weight-dropped LSTM</h3><p><em>Dropout</em> 是神经网络中常用的防止过拟合的方法，但是用在 <em>RNN</em> 型的网络中通常效果不佳。这与 <em>Dropout</em> 的原理有关，见下图中间：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/nn_do1.jpg" alt></p>
<p><em>Dropout</em> 会随机丢掉一些神经元，即将神经元节点置为零。这样 $h_t$ 接收到的 $h_{t-1}$ 就不完整了，会干扰 <em>RNN</em>  的长程依赖能力。为了解决这一问题，<a href="http://proceedings.mlr.press/v28/wan13.pdf" target="_blank" rel="noopener"><em>Wan</em></a> 等人提出 <em>DropConnect</em> 技术，如上图右侧。不同于 <em>Dropout</em> 的丢掉神经元，<em>DropConnect</em> 是随机丢掉一些权重，完整的保留了神经元。用伪代码来说明如下：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># Dropout</span></span><br><span class="line">h_1 = RNNCell(x)</span><br><span class="line">h_2 = Dropout(h_1)</span><br><span class="line"></span><br><span class="line"><span class="comment"># DropConnect</span></span><br><span class="line">h_1 = RNNCell(x)</span><br><span class="line">h_2 = Dropout(h_1.weights)</span><br></pre></td></tr></table></figure>
<p>这样就不会影响到 <em>RNN</em> 的长程依赖能力了。</p>
<p>LSTM 的权重参数包括 $[W_f, W_i, W_C, W_o, U_f, U_i, U_C, U_o]$，其中 $W^{*}$ 是与输入 $x_t$ 相关的， $U^{*}$ 是与隐状态相关的。LSTM 的梯度问题通常与隐状态有关（循环连乘带来的梯度消失或者爆炸），因此将 <em>DropConnect</em> 应用于 $U^{*}$ 上效果更好（当然，$W^{*}$ 和 $U^{*}$ 都用也行，只是考虑到以牺牲效率为代价换来的效果提升并不明显）。</p>
<h3 id="2-3-2-Non-monotonically-Triggered-ASGD"><a href="#2-3-2-Non-monotonically-Triggered-ASGD" class="headerlink" title="2.3.2 Non-monotonically Triggered ASGD"></a>2.3.2 Non-monotonically Triggered ASGD</h3><p>对于语言建模任务来说，传统的 SGD 优化算法比带动量的 SGD 变体效果更好。因此，作者在调研了一些传统 SGD 算法之后选定了 ASGD 算法。</p>
<p>所谓 ASGD 算法指的是 Averaged SGD 算法，它是 <a href="https://epubs.siam.org/doi/abs/10.1137/0330046?journalCode=sjcodc" target="_blank" rel="noopener">Polyak &amp; Juditsky</a> 等人 1992 年提出的一种优化算法，经过了二十多年的研究发展，ASGD 已经非常成熟，无论是理论研究还是实际表现都非常出色。</p>
<p>ASGD 采取和 SGD 相同的更新步骤，不同的是传统 SGD 在更新权重的时候只考虑当前的轮次，而 ASGD 不仅考虑当前的轮次还考虑之前的轮次，然后计算平均值。用伪代码来表示如下：</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"># 传统 SGD</span><br><span class="line">w_t= w_t_1 - lr * grad(w_t_1)</span><br><span class="line"></span><br><span class="line"># ASGD</span><br><span class="line">avg_fact = 1 / max(t - K, 1)</span><br><span class="line"></span><br><span class="line">if avg_fact != 1:</span><br><span class="line">    w_t = avg_fact * (sum(w_t_1) + (w_t_1 - lr_t * grad(w_t_1)))</span><br><span class="line">else:</span><br><span class="line">    w_t = w_t_1 - lr_t * grad(w_t_1)</span><br></pre></td></tr></table></figure>
<p>其中 $K$ 表示在计算权重平均值之前权重更新的迭代的次数，也就是说，前 $K$ 轮的 ASGD 与 SGD 是完全相同的。</p>
<p>但是作者认为这种方法有两处不足：</p>
<ol>
<li>学习率的调整原则不明确；</li>
<li>参数 $K$ 作为超参，其取值原则也不明确。$K$ 值太小会对效果产生负面影响；取值太大可能需要更多的迭代才能收敛。</li>
</ol>
<p>因此，作者提出了 ASGD 的一种变体—— NT-ASGD，即非单调触发 ASGD（<em>Non-monotonically Triggered ASGD</em>），算法如下：</p>
<p><img src="https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/20210409194127.png" alt></p>
<ul>
<li>当模型评估指标多轮训练（$n$）后都没有提升的时候 ASGD 就会触发，实验发现 $n=5$ 的效果最好；</li>
<li>整个实验使用恒定的学习率。</li>
</ul>
<h3 id="2-3-3-其他正则化方法"><a href="#2-3-3-其他正则化方法" class="headerlink" title="2.3.3 其他正则化方法"></a>2.3.3 其他正则化方法</h3><p>除了上面讨论到的两种技术，论文作者还使用了其他预防过拟合、提升数据效率的正则化技术。</p>
<h4 id="2-3-3-1-可变长度反向传播序列"><a href="#2-3-3-1-可变长度反向传播序列" class="headerlink" title="2.3.3.1 可变长度反向传播序列"></a>2.3.3.1 可变长度反向传播序列</h4><p>一般在训练语言模型的时候，将整个语料看成一个连续的超长的句子，在预处理的时候会将句子截断成固定长度的 <em>batch size</em> 个序列。这样由于句子被截断，在后向传播的过程中神经网络学到的信息就不完整了。比如：</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">原始语料：“我是中国人。我爱北京天安门。”</span><br><span class="line">预处理后：[</span><br><span class="line">    &quot;我是中国人。我&quot;,</span><br><span class="line">    &quot;爱北京天安门。&quot;</span><br><span class="line">]</span><br></pre></td></tr></table></figure>
<p>“我爱北京天安门。”这句话中的 “我” 就无法学到任何信息，因为它后面的内容被截断了。</p>
<p>为了解决这个问题，作者提出了使用可变长度的反向传播序列。首先以概率 $p$ 选取长度为 $bptt$ 的序列，然后以概率 $1-p$ 选取长度为 $bptt/2$ 的序列。($p$ 是个超参数，实验中作者选用的 $p=0.95$)。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">base_bptt = bptt <span class="keyword">if</span> np.random.random() &lt; <span class="number">0.95</span> <span class="keyword">else</span> bptt / <span class="number">2</span></span><br></pre></td></tr></table></figure>
<p>然后根据 $N(base\_bptt, s)$ 得到序列长度，其中 $s$ 表示标准差，$N$ 表示正态分布。代码如下：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">seq_len = max(<span class="number">5</span>, int(np.random.normal(base_bptt, <span class="number">5</span>)))</span><br></pre></td></tr></table></figure>
<p>然后再根据 <code>seq_len</code>  改变学习率。因为当学习速率固定时，训练会更倾向于短序列，所以需要按序列长度对学习率进行缩放。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">lr2 = lr * seq_len / bptt</span><br></pre></td></tr></table></figure>
<blockquote>
<p>作者的这种做法其实还是引入了很多超参。其实还有一种更好的方法，可以在固定长度的 BPTT 下，不影响效果。</p>
<p>上面的例子中，“我是中国人。我爱北京天安门。”被分成了 [“我是中国人。我”,  “爱北京天安门。”]。这是通常的做法。我们还可以用下面的这种方法：</p>
<p>原始语料：“我是中国人。我爱北京天安门。”<br>预处理后：[</p>
<p>​    “我是中国人。我”,</p>
<p>​    “是中国人。我爱”,</p>
<p>​    “中国人。我爱北”,</p>
<p>​    …</p>
<p>​    “爱北京天安门。”</p>
<p>]</p>
</blockquote>
<h4 id="2-3-3-2-变分-Dropout"><a href="#2-3-3-2-变分-Dropout" class="headerlink" title="2.3.3.2 变分 Dropout"></a>2.3.3.2 变分 Dropout</h4><p>通常情况下，每次调用 <em>Dropout</em> 时取样一个新的 <em>dropout mask</em>。但是在 LSTM 中参数是共享的，作者希望在不同的时刻共享的参数也共享同一套 <em>dropout mask</em>，这就是 <em>variational dropout</em>，在隐层作者使用了共享<em>mask</em> 的 <em>dropConnect</em>，而在输入和输出中，作者使用共享 <em>mask</em> 的 <em>variational dropout</em>。但是请注意在不同的 <em>mini-batch</em> 中，<em>mask</em> 是不共享的，所以 <em>mask</em> 的共享和参数共享还是有区别的，<em>dropout mask</em> 的共享是在每一个迭代中发生的，不同的迭代输入的数据不同，为了体现数据的差异性，要保证 <em>dropout mask</em> 不一致。</p>
<h4 id="2-3-3-3-嵌入-Dropout"><a href="#2-3-3-3-嵌入-Dropout" class="headerlink" title="2.3.3.3 嵌入 Dropout"></a>2.3.3.3 嵌入 Dropout</h4><p>对嵌入层引入 <em>Dropout</em>，实际上是在词级上操作的，即随机将一些词给去掉，这些被去掉的词的向量值就全为 0，并且在前向和反向传播中都保持这样的操作。对其余没有丢掉的词，用 $\frac{1}{1-p_e}$ 缩放其向量值，$p_e$ 为 <em>Dropout</em> 的比例。</p>
<h4 id="2-3-3-4-权重绑定"><a href="#2-3-3-4-权重绑定" class="headerlink" title="2.3.3.4 权重绑定"></a>2.3.3.4 权重绑定</h4><p>共享 <em>embedding</em> 层和 <em>softmax</em> 层，可以降低模型总参数。语言模型的最后输出也是 $|\mathcal{V}|$ 维，是预测词表中每个词的概率，从模型设计上来看嵌入层和最后输出层的参数矩阵的维度是很容易保证一致的，而从语言模型的特性上来看两个矩阵之间也是有一定的联系，所以作者选择共享嵌入层和输出层的权重矩阵，这种方法在 <em>seq2seq</em> 中也经常用到。</p>
<h4 id="2-3-3-5-减小嵌入尺寸"><a href="#2-3-3-5-减小嵌入尺寸" class="headerlink" title="2.3.3.5 减小嵌入尺寸"></a>2.3.3.5 减小嵌入尺寸</h4><p>减少语言模型的总参数量的最简单方法是降低词向量的尺寸，尽管这无助于缓解过拟合。论文作者修改了第一个和最后一个 LSTM 层的输入和输出维度，以和降低了的嵌入尺寸保持一致。</p>
<h4 id="2-3-3-6-激活正则化和时域激活正则化"><a href="#2-3-3-6-激活正则化和时域激活正则化" class="headerlink" title="2.3.3.6 激活正则化和时域激活正则化"></a>2.3.3.6 激活正则化和时域激活正则化</h4><p>常见的正则化技术除了 <em>Dropout</em> 以外还有 $L_2$ 正则化。作者在模型中不仅用了 <em>Dropout</em> 还用了 $L_2$ 正则化，$L_2$ 正则化分成两部分：</p>
<ul>
<li><p>对每个单独的 $h_t$ ，用于惩罚明显过大的值。这部分称之为 <em>Activation Regularization</em>：</p>
<script type="math/tex; mode=display">
\alpha L_2(m \odot h_t)</script><p>其中 $m$ 为 <em>dropout mask</em>，$\alpha$ 为缩放系数。</p>
</li>
<li><p>对 $h_t$ 和 $h_{t+1}$ 之间的差值，用于惩罚隐状态变动过大，称之为 <em>Temporal Activation Regularization</em>。这一步很容易理解，$h_{t}$ 包含了之前的所有信息，$h_{t+1}$ 不仅包含了之前的所有信息，还包含了当前信息。一个通顺的句子包含的信息应该是平滑的，不会因为某个词的出现大规模改变隐状态。如果两个连续的隐状态之间出现了较大的差别很可能是训练过程出现了问题，所以通过 $L_2$ 正则化进行修正：</p>
<script type="math/tex; mode=display">
\beta L_2(h_t-h_{t+1})</script></li>
</ul>
<h1 id="3-总结"><a href="#3-总结" class="headerlink" title="3. 总结"></a>3. 总结</h1><p>本文介绍了 LSTM 语言模型，尤其着重介绍了 AWD-LSTM 语言模型。LSTM 作为在 <em>Transformer</em> 出现之前最优秀的序列建模的模型一直是 NLP 中的王者，实际上即使是 <em>Transformer</em> 在众多任务中的表现强于 LSTM，但是 LSTM  在序列位置捕捉能力上还是强于 <em>Transformer</em>。本文不仅包含了 LSTM 语言建模的思路 ，也介绍了多种非常有用的序列建模的优化方法。 </p>
<h1 id="4-Reference"><a href="#4-Reference" class="headerlink" title="4. Reference"></a>4. Reference</h1><ol>
<li><p><em>Morin, F., Bengio, Y.</em>，<a href="http://www-labs.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf" target="_blank" rel="noopener">Hierarchical Probabilistic Neural Network Language Model</a></p>
</li>
<li><p><em>Goodman, J.,</em> <a href="https://ieeexplore.ieee.org/document/940893" target="_blank" rel="noopener">Classes for fast maximum entropy training</a></p>
</li>
<li><p><em>Martin Sundermeyer, Ralf Schluter, and Hermann Ney</em>，<a href="http://www-i6.informatik.rwth-aachen.de/publications/download/820/Sundermeyer-2012.pdf" target="_blank" rel="noopener">LSTM Neural Networks for Language Modeling</a> </p>
</li>
<li><p><em>Stephen Merity, Nitish Shirish Keskar, Richard Socher</em>，<a href="https://arxiv.org/pdf/1708.02182.pdf" target="_blank" rel="noopener">Regularizing and Optimizing LSTM Language Models</a></p>
</li>
<li><p><em>Li Wan, Matthew Zeiler, Sixin Zhang, Yann Le Cun, Rob Fergus</em>，<a href="http://proceedings.mlr.press/v28/wan13.pdf" target="_blank" rel="noopener">Regularization of Neural Networks using DropConnect</a></p>
</li>
<li><p><em>Yashu Seth</em>, <a href="https://yashuseth.blog/2018/09/12/awd-lstm-explanation-understanding-language-model/" target="_blank" rel="noopener">What makes the AWD-LSTM great?</a></p>
</li>
<li><p><a href="https://www.cnblogs.com/jiangxinyang/p/13125519.html" target="_blank" rel="noopener">语言模型系列（一）——AWD-LSTM</a></p>
</li>
</ol>

        </div>
        
          


  <section class='meta' id="footer-meta">
    <hr>
    <div class='new-meta-box'>
      
        
          <div class="new-meta-item date" itemprop="dateUpdated" datetime="2021-08-23T01:05:27+08:00">
  <a class='notlink'>
    <i class="fas fa-clock" aria-hidden="true"></i>
    <p>最后更新于 2021年8月23日</p>
  </a>
</div>

        
      
        
          
  
  <div class="new-meta-item meta-tags"><a class="tag" href="/tags/nlp/" rel="nofollow"><i class="fas fa-hashtag" aria-hidden="true"></i>&nbsp;<p>NLP</p></a></div> <div class="new-meta-item meta-tags"><a class="tag" href="/tags/lstmlm/" rel="nofollow"><i class="fas fa-hashtag" aria-hidden="true"></i>&nbsp;<p>LSTMLM</p></a></div>


        
      
        
          
  <div class="new-meta-item share -mob-share-list">
  <div class="-mob-share-list share-body">
    
      
        <a class="-mob-share-qq" title="QQ好友" rel="external nofollow noopener noreferrer"
          
          href="http://connect.qq.com/widget/shareqq/index.html?url=https://rogerspy.gitee.io/2021/03/31/neural-language-model-lstm/&title=预训练语言模型-神经网络语言模型：LSTMLM | Rogerspy's Home&summary=1. 简介Mikolov 等人提出的 RNN 语言模型解决了前馈神经网络语言模型的语序问题。但是由于 RNN 神经网络本身存在着长程依赖问题，导致 RNN 语言模型很难学到距离较远的信息。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/qq.png">
          
        </a>
      
    
      
        <a class="-mob-share-qzone" title="QQ空间" rel="external nofollow noopener noreferrer"
          
          href="https://sns.qzone.qq.com/cgi-bin/qzshare/cgi_qzshare_onekey?url=https://rogerspy.gitee.io/2021/03/31/neural-language-model-lstm/&title=预训练语言模型-神经网络语言模型：LSTMLM | Rogerspy's Home&summary=1. 简介Mikolov 等人提出的 RNN 语言模型解决了前馈神经网络语言模型的语序问题。但是由于 RNN 神经网络本身存在着长程依赖问题，导致 RNN 语言模型很难学到距离较远的信息。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/qzone.png">
          
        </a>
      
    
      
        <a class='qrcode' rel="external nofollow noopener noreferrer" href=''>
        
          <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/wechat.png">
        
        </a>
      
    
      
        <a class="-mob-share-weibo" title="微博" rel="external nofollow noopener noreferrer"
          
          href="http://service.weibo.com/share/share.php?url=https://rogerspy.gitee.io/2021/03/31/neural-language-model-lstm/&title=预训练语言模型-神经网络语言模型：LSTMLM | Rogerspy's Home&summary=1. 简介Mikolov 等人提出的 RNN 语言模型解决了前馈神经网络语言模型的语序问题。但是由于 RNN 神经网络本身存在着长程依赖问题，导致 RNN 语言模型很难学到距离较远的信息。"
          
          >
          
            <img src="https://cdn.jsdelivr.net/gh/xaoxuu/assets@19.1.9/logo/128/weibo.png">
          
        </a>
      
    
  </div>
</div>



        
      
    </div>
  </section>


        
        
            <div class="prev-next">
                
                    <section class="prev">
                        <span class="art-item-left">
                            <h6><i class="fas fa-chevron-left" aria-hidden="true"></i>&nbsp;上一页</h6>
                            <h4>
                                <a href="/2021/04/06/Implicit_Regularization/" rel="prev" title="随机梯度下降中隐式正则化的起源">
                                  
                                      随机梯度下降中隐式正则化的起源
                                  
                                </a>
                            </h4>
                            
                                
                                <h6 class="tags">
                                    <a class="tag" href="/tags/nlp/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>NLP</a> <a class="tag" href="/tags/隐式正则化/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>隐式正则化</a>
                                </h6>
                            
                        </span>
                    </section>
                
                
                    <section class="next">
                        <span class="art-item-right">
                            <h6>下一页&nbsp;<i class="fas fa-chevron-right" aria-hidden="true"></i></h6>
                            <h4>
                                <a href="/2021/03/24/neural-language-model-rnn/" rel="next" title="预训练语言模型-神经网络语言模型：RNNLM">
                                    
                                        预训练语言模型-神经网络语言模型：RNNLM
                                    
                                </a>
                            </h4>
                            
                                
                                <h6 class="tags">
                                    <a class="tag" href="/tags/nlp/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>NLP</a> <a class="tag" href="/tags/rnnlm/"><i class="fas fa-hashtag fa-fw" aria-hidden="true"></i>RNNLM</a>
                                </h6>
                            
                        </span>
                    </section>
                
            </div>
        
      </section>
    </article>
  

  
    <!-- 显示推荐文章和评论 -->



  <article class="post white-box comments">
    <section class="article typo">
      <h4><i class="fas fa-comments fa-fw" aria-hidden="true"></i>&nbsp;评论</h4>
      
      
      
        <section id="comments">
          <div id="gitalk-container"></div>
        </section>
      
      
    </section>
  </article>


  




<!-- 根据页面mathjax变量决定是否加载MathJax数学公式js -->

  <!-- MathJax配置，可通过单美元符号书写行内公式等 -->
<script type="text/x-mathjax-config">
  // MathJax configuration, supplied through a text/x-mathjax-config script
  // so MathJax can evaluate it while it starts up.
  MathJax.Hub.Config({
    // Options for the HTML-CSS output jax.
    // NOTE(review): the loader script on this page requests
    // config=TeX-MML-AM_CHTML, which looks like the CommonHTML output jax;
    // confirm whether this "HTML-CSS" section actually takes effect.
    "HTML-CSS": {
      preferredFont: "TeX",
      availableFonts: ["STIX","TeX"],
      linebreaks: { automatic:true },
      // Smaller equation chunk size when MathJax reports a mobile browser.
      EqnChunk: (MathJax.Hub.Browser.isMobile ? 10 : 50)
    },
    // Preprocessor that locates TeX in the page text.
    tex2jax: {
      // Accept both $...$ and \(...\) as inline math delimiters.
      inlineMath: [ ["$", "$"], ["\\(","\\)"] ],
      processEscapes: true,
      // Class-name pattern and tag names handed to tex2jax as content to leave alone.
      ignoreClass: "tex2jax_ignore|dno",
      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
    },
    TeX: {
      equationNumbers: { autoNumber: "AMS" },
      // Styling attributes applied to undefined control sequences.
      noUndefined: { attributes: { mathcolor: "red", mathbackground: "#FFEEEE", mathsize: "90%" } },
      // Redefines \href as an empty group.
      // NOTE(review): presumably to neutralise \href links in formulas; confirm intent.
      Macros: { href: "{}" }
    },
    messageStyle: "none"
  });
</script>
<!-- 给MathJax元素添加has-jax class -->
<script type="text/x-mathjax-config">
  // Queue a callback on MathJax's hub that tags the parent element of every
  // math source element with the `has-jax` class.
  MathJax.Hub.Queue(function() {
    MathJax.Hub.getAllJax().forEach(function(jax) {
      // classList.add is idempotent: a parent that holds several formulas
      // receives `has-jax` once instead of once per formula, which is what
      // the previous `className +=` concatenation produced.
      jax.SourceElement().parentNode.classList.add('has-jax');
    });
  });
</script>
<!-- 通过连接CDN加载MathJax的js代码 -->
<script type="text/javascript" async
  src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML">
</script>




  <script>
    // Page-level data published on `window`: the post title and a `tools` flag.
    // NOTE(review): the script that reads `subData` is not visible here;
    // confirm what `tools: true` enables.
    window.subData = {
      title: '预训练语言模型-神经网络语言模型：LSTMLM',
      tools: true
    }
  </script>


</div>
<aside class='l_side'>
  
    
    
      
        
          
          
            <section class='widget shake author'>
  <!-- Author card: avatar plus icon-only social links.
       The links render only a Font Awesome glyph, so each one carries an
       aria-label to give it an accessible name. -->
  <div class='content pure'>
      <div class='avatar'>
        <img class='avatar' src='https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/65-1Z31313530JC.jpeg' alt='Rogerspy'/>
      </div>
      <div class="social-wrapper">
            <a href="/atom.xml"
              class="social fas fa-rss flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer"
              aria-label="RSS 订阅">
            </a>
            <a href="mailto:rogerspy@163.com"
              class="social fas fa-envelope flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer"
              aria-label="电子邮件">
            </a>
            <a href="https://github.com/rogerspy"
              class="social fab fa-github flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer"
              aria-label="GitHub">
            </a>
            <a href="https://music.163.com/#/user/home?id=1960721923"
              class="social fas fa-headphones-alt flat-btn"
              target="_blank"
              rel="external nofollow noopener noreferrer"
              aria-label="网易云音乐">
            </a>
      </div>
  </div>
</section>

          
        
      
        
          
          
            
  <section class='widget toc-wrapper'>
    
<header class='pure'>
  <div><i class="fas fa-list fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;本文目录</div>
  
    <div class='wrapper'><a class="s-toc rightBtn" rel="external nofollow noopener noreferrer" href="javascript:void(0)"><i class="fas fa-thumbtack fa-fw"></i></a></div>
  
</header>

    <div class='content pure'>
      <ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#1-简介"><span class="toc-text">1. 简介</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#2-LSTM-语言模型"><span class="toc-text">2. LSTM 语言模型</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#2-1-LSTM-神经网络简介"><span class="toc-text">2.1 LSTM 神经网络简介</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-2-LSTM-语言模型"><span class="toc-text">2.2 LSTM 语言模型</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-3-AWD-LSTM-语言模型"><span class="toc-text">2.3 AWD-LSTM 语言模型</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#2-3-1-weight-dropped-LSTM"><span class="toc-text">2.3.1 weight-dropped LSTM</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#2-3-2-Non-monotonically-Triggered-ASGD"><span class="toc-text">2.3.2 Non-monotonically Triggered ASGD</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#2-3-3-其他正则化方法"><span class="toc-text">2.3.3 其他正则化方法</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-1-可变长度反向传播序列"><span class="toc-text">2.3.3.1 可变长度反向传播序列</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-2-变分-Dropout"><span class="toc-text">2.3.3.2 变分 Dropout</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-3-嵌入-Dropout"><span class="toc-text">2.3.3.3 嵌入 Dropout</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-4-权重绑定"><span class="toc-text">2.3.3.4 权重绑定</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-5-减小嵌入尺寸"><span class="toc-text">2.3.3.5 减小嵌入尺寸</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-3-6-激活正则化和时域激活正则化"><span class="toc-text">2.3.3.6 激活正则化和时域激活正则化</span></a></li></ol></li></ol></li></ol></li><li 
class="toc-item toc-level-1"><a class="toc-link" href="#3-总结"><span class="toc-text">3. 总结</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#4-Reference"><span class="toc-text">4. Reference</span></a></li></ol>
    </div>
  </section>


          
        
      
        
          
          
            <section class='widget grid'>
  
<header class='pure'>
  <div><i class="fas fa-map-signs fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;站内导航</div>
  
</header>

  <div class='content pure'>
    <ul class="grid navgation">
      <!-- In-site navigation grid: one icon + label link per entry.
           The stray double quote that followed class="flat-box" on every link
           (parsed as a bogus attribute) has been removed. -->
      <li><a class="flat-box" href="/" id="home">
        <i class="fas fa-clock fa-fw" aria-hidden="true"></i>
        近期文章
      </a></li>
      <li><a class="flat-box" href="/blog/" id="blog">
        <i class="fas fa-edit fa-fw" aria-hidden="true"></i>
        我的博客
      </a></li>
      <li><a class="flat-box" href="/paper_note/" id="paper_note">
        <i class="fas fa-book fa-fw" aria-hidden="true"></i>
        论文笔记
      </a></li>
      <li><a class="flat-box" href="/algorithm/" id="algorithm">
        <i class="fas fa-cube fa-fw" aria-hidden="true"></i>
        算法基础
      </a></li>
      <li><a class="flat-box" href="/leetcode/" id="leetcode">
        <i class="fas fa-code fa-fw" aria-hidden="true"></i>
        Leetcode
      </a></li>
      <li><a class="flat-box" href="/video/" id="video">
        <i class="fas fa-film fa-fw" aria-hidden="true"></i>
        视频小站
      </a></li>
      <li><a class="flat-box" href="/material/" id="material">
        <i class="fas fa-briefcase fa-fw" aria-hidden="true"></i>
        学习资料
      </a></li>
      <li><a class="flat-box" href="/dataset/" id="dataset">
        <i class="fas fa-database fa-fw" aria-hidden="true"></i>
        数据集
      </a></li>
      <li><a class="flat-box" href="/articles/" id="articles">
        <i class="fas fa-sticky-note fa-fw" aria-hidden="true"></i>
        杂文天地
      </a></li>
      <li><a class="flat-box" href="/blog/archives/" rel="nofollow" id="blogarchives">
        <i class="fas fa-archive fa-fw" aria-hidden="true"></i>
        文章归档
      </a></li>
      <li><a class="flat-box" href="/personal_center/" id="personal_center">
        <i class="fas fa-university fa-fw" aria-hidden="true"></i>
        个人中心
      </a></li>
      <li><a class="flat-box" href="/about/" rel="nofollow" id="about">
        <i class="fas fa-info-circle fa-fw" aria-hidden="true"></i>
        关于小站
      </a></li>
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-terminal fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;机器学习框架</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://rogerspy.gitee.io/pytorch-zh/" href="https://rogerspy.gitee.io/pytorch-zh/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;PyTorch 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://keras-zh.readthedocs.io/" href="https://keras-zh.readthedocs.io/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Keras 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://tensorflow.google.cn/" href="https://tensorflow.google.cn/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Tensorflow 中文文档
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="http://scikitlearn.com.cn/" href="http://scikitlearn.com.cn/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-star fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Scikit Learn 中文文档
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-wrench fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;百宝箱</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://rogerspy.github.io/excalidraw-claymate/" href="https://rogerspy.github.io/excalidraw-claymate/"
          
          
            target="_blank"
          
          >
          <div class='name'>
            
              <i class="fas fa-magic fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Excalidraw-Claymate
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://rogerspy.github.io/jupyterlite/" href="https://rogerspy.github.io/jupyterlite/"
          
          
            target="_blank"
          
          >
          <div class='name'>
            
              <i class="fas fa-terminal fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;JupyterLite
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            <section class='widget list'>
  
<header class='pure'>
  <div><i class="fas fa-eye fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;睁眼看世界</div>
  
</header>

  <div class='content pure'>
    <ul class="entry">
      
        <li><a class="flat-box" title="https://deeplearn.org/" href="https://deeplearn.org/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Deep Learning Monitor
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://paperswithcode.com/sota" href="https://paperswithcode.com/sota"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Browse State-of-the-Art
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://huggingface.co/transformers/" href="https://huggingface.co/transformers/"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Transformers
          </div>
          
        </a></li>
      
        <li><a class="flat-box" title="https://huggingface.co/models" href="https://huggingface.co/models"
          
          
          >
          <div class='name'>
            
              <i class="fas fa-link fa-fw" aria-hidden="true"></i>
            
            &nbsp;&nbsp;Transformers-models
          </div>
          
        </a></li>
      
    </ul>
  </div>
</section>

          
        
      
        
          
          
            
  <section class='widget category'>
    
<header class='pure'>
  <div><i class="fas fa-folder-open fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;文章分类</div>
  
    <a class="rightBtn"
    
      rel="nofollow"
    
    
    href="/categories/"
    title="categories/">
    <i class="fas fa-expand-arrows-alt fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      <ul class="entry">
        
          <li><a class="flat-box" title="/categories/nl2sql/" href="/categories/nl2sql/"><div class='name'>NL2SQL</div><div class='badge'>(1)</div></a></li>
        
          <li><a class="flat-box" title="/categories/nlp/" href="/categories/nlp/"><div class='name'>NLP</div><div class='badge'>(23)</div></a></li>
        
          <li><a class="flat-box" title="/categories/博客转载/" href="/categories/博客转载/"><div class='name'>博客转载</div><div class='badge'>(5)</div></a></li>
        
          <li><a class="flat-box" title="/categories/数据结构与算法/" href="/categories/数据结构与算法/"><div class='name'>数据结构与算法</div><div class='badge'>(11)</div></a></li>
        
          <li><a class="flat-box" title="/categories/知识图谱/" href="/categories/知识图谱/"><div class='name'>知识图谱</div><div class='badge'>(3)</div></a></li>
        
          <li><a class="flat-box" title="/categories/论文解读/" href="/categories/论文解读/"><div class='name'>论文解读</div><div class='badge'>(2)</div></a></li>
        
          <li><a class="flat-box" title="/categories/语言模型/" href="/categories/语言模型/"><div class='name'>语言模型</div><div class='badge'>(10)</div></a></li>
        
      </ul>
    </div>
  </section>


          
        
      
        
          
          
            
  <section class='widget tagcloud'>
    
<header class='pure'>
  <div><i class="fas fa-fire fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;热门标签</div>
  
    <a class="rightBtn"
    
      rel="nofollow"
    
    
    href="/tags/"
    title="tags/">
    <i class="fas fa-expand-arrows-alt fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      <a href="/tags/attention/" style="font-size: 16.86px; color: #868686">Attention</a> <a href="/tags/cnnlm/" style="font-size: 14px; color: #999">CNNLM</a> <a href="/tags/data-structure/" style="font-size: 14px; color: #999">Data Structure</a> <a href="/tags/deep/" style="font-size: 14px; color: #999">Deep</a> <a href="/tags/ffnnlm/" style="font-size: 14px; color: #999">FFNNLM</a> <a href="/tags/gaussian/" style="font-size: 14px; color: #999">Gaussian</a> <a href="/tags/initialization/" style="font-size: 14px; color: #999">Initialization</a> <a href="/tags/kg/" style="font-size: 16.86px; color: #868686">KG</a> <a href="/tags/lstm/" style="font-size: 14px; color: #999">LSTM</a> <a href="/tags/lstmlm/" style="font-size: 14px; color: #999">LSTMLM</a> <a href="/tags/language-model/" style="font-size: 16.86px; color: #868686">Language Model</a> <a href="/tags/log-linear-language-model/" style="font-size: 14px; color: #999">Log-Linear Language Model</a> <a href="/tags/nlp/" style="font-size: 19.71px; color: #727272">NLP</a> <a href="/tags/nmt/" style="font-size: 22.57px; color: #5f5f5f">NMT</a> <a href="/tags/norm/" style="font-size: 14px; color: #999">Norm</a> <a href="/tags/probabilistic-language-model/" style="font-size: 14px; color: #999">Probabilistic Language Model</a> <a href="/tags/rnnlm/" style="font-size: 14px; color: #999">RNNLM</a> <a href="/tags/roc-auc/" style="font-size: 14px; color: #999">ROC-AUC</a> <a href="/tags/transformer/" style="font-size: 24px; color: #555">Transformer</a> <a href="/tags/context2vec/" style="font-size: 14px; color: #999">context2vec</a> <a href="/tags/divide-conquer/" style="font-size: 14px; color: #999">divide-conquer</a> <a href="/tags/insertion/" style="font-size: 16.86px; color: #868686">insertion</a> <a href="/tags/insertion-deletion/" style="font-size: 15.43px; color: #8f8f8f">insertion-deletion</a> <a href="/tags/knowledge-modelling/" style="font-size: 15.43px; color: #8f8f8f">knowledge-modelling</a> <a 
href="/tags/nl2infographic/" style="font-size: 14px; color: #999">nl2infographic</a> <a href="/tags/nl2sql/" style="font-size: 14px; color: #999">nl2sql</a> <a href="/tags/ontology/" style="font-size: 14px; color: #999">ontology</a> <a href="/tags/parallel-recurrent/" style="font-size: 14px; color: #999">parallel-recurrent</a> <a href="/tags/pytorch/" style="font-size: 14px; color: #999">pytorch</a> <a href="/tags/queue/" style="font-size: 18.29px; color: #7c7c7c">queue</a> <a href="/tags/sparse/" style="font-size: 14px; color: #999">sparse</a> <a href="/tags/stack/" style="font-size: 14px; color: #999">stack</a> <a href="/tags/tensorflow/" style="font-size: 14px; color: #999">tensorflow</a> <a href="/tags/text2viz/" style="font-size: 14px; color: #999">text2viz</a> <a href="/tags/weighted-head/" style="font-size: 14px; color: #999">weighted-head</a> <a href="/tags/半监督语言模型/" style="font-size: 14px; color: #999">半监督语言模型</a> <a href="/tags/双数组前缀树/" style="font-size: 14px; color: #999">双数组前缀树</a> <a href="/tags/推荐系统/" style="font-size: 14px; color: #999">推荐系统</a> <a href="/tags/数据结构/" style="font-size: 21.14px; color: #686868">数据结构</a> <a href="/tags/数组/" style="font-size: 14px; color: #999">数组</a> <a href="/tags/时间复杂度/" style="font-size: 14px; color: #999">时间复杂度</a> <a href="/tags/算法/" style="font-size: 14px; color: #999">算法</a> <a href="/tags/评估方法/" style="font-size: 14px; color: #999">评估方法</a> <a href="/tags/词向量/" style="font-size: 14px; color: #999">词向量</a> <a href="/tags/隐式正则化/" style="font-size: 14px; color: #999">隐式正则化</a>
    </div>
  </section>


          
        
      
        
          
          
            


  <section class='widget music'>
    
<header class='pure'>
  <div><i class="fas fa-compact-disc fa-fw" aria-hidden="true"></i>&nbsp;&nbsp;最近在听</div>
  
    <a class="rightBtn"
    
      rel="external nofollow noopener noreferrer"
    
    
      target="_blank"
    
    href="https://music.163.com/#/user/home?id=1960721923"
    title="https://music.163.com/#/user/home?id=1960721923">
    <i class="far fa-heart fa-fw"></i></a>
  
</header>

    <div class='content pure'>
      
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/aplayer@1.7.0/dist/APlayer.min.css">
  <div class="aplayer"
    data-theme="#1BCDFC"
    
    
    data-mode="circulation"
    data-server="netease"
    data-type="playlist"
    data-id="2957571193"
    data-volume="0.7">
  </div>
  <script src="https://cdn.jsdelivr.net/npm/aplayer@1.7.0/dist/APlayer.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/meting@1.1.0/dist/Meting.min.js"></script>


    </div>
  </section>


          
        
      
    

  
</aside>

<footer id="footer" class="clearfix">
  <!-- Site footer: uptime placeholder, social links, licence notice and
       theme credit with the site page-view counter.
       The social links are icon-only, so each carries an aria-label. -->
  <div id="sitetime"></div>
    <div class="social-wrapper">
          <a href="/atom.xml"
            class="social fas fa-rss flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer"
            aria-label="RSS 订阅">
          </a>
          <a href="mailto:rogerspy@163.com"
            class="social fas fa-envelope flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer"
            aria-label="电子邮件">
          </a>
          <a href="https://github.com/rogerspy"
            class="social fab fa-github flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer"
            aria-label="GitHub">
          </a>
          <a href="https://music.163.com/#/user/home?id=1960721923"
            class="social fas fa-headphones-alt flat-btn"
            target="_blank"
            rel="external nofollow noopener noreferrer"
            aria-label="网易云音乐">
          </a>
    </div>
  <br>
  <div><p>博客内容遵循 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh">署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0) 协议</a></p>
</div>
  <div>
    本站使用
    <!-- rel added: this link opens a new tab on a third-party origin -->
    <a href="https://xaoxuu.com/wiki/material-x/" target="_blank" rel="noopener noreferrer" class="codename">Material X</a>
    作为主题
      ，
      总访问量为
      <span id="busuanzi_value_site_pv"><i class="fas fa-spinner fa-spin fa-fw" aria-hidden="true"></i></span>
      次
    。
  </div>
</footer>

<script>setLoadingBarProgress(80);</script>
<!-- 点击特效，输入特效 运行时间 -->
<script type="text/javascript" src="/cool/cooltext.js"></script>
<script type="text/javascript" src="/cool/clicklove.js"></script>
<script type="text/javascript" src="/cool/sitetime.js"></script>



      <script>setLoadingBarProgress(60);</script>
    </div>
    <a class="s-top fas fa-arrow-up fa-fw" href='javascript:void(0)'></a>
  </div>
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.3.1/dist/jquery.min.js"></script>

  <script>
    // Global search settings. Every provider credential is empty; the
    // selected service is "hexo" and the site root is "/".
    // NOTE(review): presumably consumed by the theme's search.js loaded later - confirm.
    var GOOGLE_CUSTOM_SEARCH_API_KEY = "",
        GOOGLE_CUSTOM_SEARCH_ENGINE_ID = "",
        ALGOLIA_API_KEY = "",
        ALGOLIA_APP_ID = "",
        ALGOLIA_INDEX_NAME = "",
        AZURE_SERVICE_NAME = "",
        AZURE_INDEX_NAME = "",
        AZURE_QUERY_KEY = "",
        BAIDU_API_ID = "";
    var SEARCH_SERVICE = "hexo";
    var ROOT = "/";
    // Guarantee a trailing slash on the root path.
    if (ROOT.charAt(ROOT.length - 1) !== '/') {
      ROOT = ROOT + '/';
    }
  </script>

<script src="//instant.page/1.2.2" type="module" integrity="sha384-2xV8M5griQmzyiY3CDqh1dn4z3llDVqZDqzjzcY+jCBCk/a5fXJmuZ/40JJAPeoU"></script>


  <!-- ScrollReveal is loaded synchronously (no async): the inline script below
       runs at DOM-ready, and an async script is not guaranteed to have
       executed by then, which left ScrollReveal undefined. -->
  <script src="https://cdn.jsdelivr.net/npm/scrollreveal@4.0.5/dist/scrollreveal.min.js"></script>
  <script>
    // Apply ScrollReveal to every .reveal element once the DOM is ready.
    $(function() {
      // The CDN request may still fail; skip the effect rather than throw.
      if (typeof ScrollReveal !== 'function') return;
      const $reveal = $('.reveal');
      if ($reveal.length === 0) return;
      const sr = ScrollReveal({ distance: 0 });
      sr.reveal('.reveal');
    });
  </script>


  <script src="https://cdn.jsdelivr.net/npm/node-waves@0.7.6/dist/waves.min.js"></script>
  <script type="text/javascript">
    // Once the DOM is ready, attach Waves effect classes to the theme's
    // button/box selectors, then initialise Waves.
    $(function() {
      // Each entry is the exact argument list for one Waves.attach call.
      var attachArgs = [
        ['.flat-btn', ['waves-button']],
        ['.float-btn', ['waves-button', 'waves-float']],
        ['.float-btn-light', ['waves-button', 'waves-float', 'waves-light']],
        ['.flat-box', ['waves-block']],
        ['.float-box', ['waves-block', 'waves-float']],
        ['.waves-image']
      ];
      attachArgs.forEach(function(args) {
        Waves.attach.apply(Waves, args);
      });
      Waves.init();
    });
  </script>


  <script async src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-busuanzi@2.3/js/busuanzi.pure.mini.js"></script>




  
  
  
    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery-backstretch/2.0.4/jquery.backstretch.min.js"></script>
    <script>
      // Background image via jquery-backstretch: applied to the .cover
      // element when the page has one, otherwise to the whole page.
      $(function(){
        var images = ["https://cdn.jsdelivr.net/gh/rogerspy/blog-imgs/a0c9e6f9efad8b731cb7376504bd10d79d2053.jpg"];
        var options = {
          duration: "6000",
          fade: "2500"
        };
        var $cover = $('.cover');
        // The original tested the string literal '.cover', which is always
        // truthy, so the page-wide fallback could never run. Test for
        // matched elements instead.
        if ($cover.length) {
          $cover.backstretch(images, options);
        } else {
          $.backstretch(images, options);
        }
      });
    </script>
  







  <!-- Gitalk comment widget: stylesheet, library, and initialisation -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.css">
  <script src="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.js"></script>
  <script type="text/javascript">
    // Create the Gitalk instance backed by the rogerspy.github.io repository.
    // NOTE(review): the OAuth client secret below is shipped in the page source - confirm this is acceptable for this OAuth app.
    var gitalk = new Gitalk({
      clientID: "35a5e4dc744cc7d162af",
      clientSecret: "7b5a409e17ce0c1971f284eac9f8902eb4b8feba",
      repo: "rogerspy.github.io",
      owner: "Rogerspy",
      admin: "Rogerspy",
      
        // NOTE(review): fixed literal rather than a value derived from this page; looks like a theme placeholder, so pages rendering the same id would presumably share one comment thread - confirm.
        id: "/wiki/material-x/",
      
      distractionFreeMode: false  // Facebook-like distraction free mode
    });
    // Render into the element with id 'gitalk-container' (not visible in this part of the file).
    gitalk.render('gitalk-container');
  </script>





  <script src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/js/app.js"></script>


  <script src="https://cdn.jsdelivr.net/gh/xaoxuu/cdn-material-x@19.5/js/search.js"></script>




<!-- 复制 -->
<script src="https://cdn.jsdelivr.net/npm/clipboard@2/dist/clipboard.min.js"></script>
<script>
  let COPY_SUCCESS = "复制成功";
  let COPY_FAILURE = "复制失败";
  /* Insert a copy button before every highlighted code block and wire it to ClipboardJS. */
  (function () {
    var buttonMarkup = [
      '<button class="btn-copy" data-clipboard-snippet="">',
      '  <i class="fa fa-copy"></i><span>复制</span>',
      '</button>'
    ].join('');
    $(".highlight .code pre").before(buttonMarkup);

    // Successful copy: log details, show the success notice, drop the selection.
    function onCopied(evt) {
      console.info('Action:', evt.action);
      console.info('Text:', evt.text);
      console.info('Trigger:', evt.trigger);
      success_prompt(COPY_SUCCESS);
      evt.clearSelection();
    }

    // Failed copy: log details and show the failure notice.
    function onCopyFailed(evt) {
      console.error('Action:', evt.action);
      console.error('Trigger:', evt.trigger);
      fail_prompt(COPY_FAILURE);
    }

    var clipboard = new ClipboardJS('.btn-copy', {
      // The button is inserted directly before its <pre>, so the copy
      // target is the button's next sibling.
      target: function (button) {
        return button.nextElementSibling;
      }
    });
    clipboard.on('success', onCopied);
    clipboard.on('error', onCopyFailed);
  })();

  /**
   * Show a pop-up notice appended to <body>; it fades out after a delay
   * (1.5 seconds by default) and is then removed from the document.
   * Note: this global assignment replaces the built-in window.prompt.
   * @param message notice content (inserted as HTML, so pass trusted text only)
   * @param style notice CSS class: alert-success, alert-danger, alert-warning
   *              or alert-info; defaults to alert-success
   * @param time seconds to stay visible before fading; defaults to 1.5
   */
  var prompt = function (message, style, time)
  {
      style = (style === undefined) ? 'alert-success' : style;
      time = (time === undefined) ? 1500 : time*1000;
      $('<div>')
          .appendTo('body')
          .addClass('alert ' + style)
          .html(message)
          .show()
          .delay(time)
          // Remove the faded-out node; previously every notice stayed in
          // <body> as a hidden element.
          .fadeOut(function () {
              $(this).remove();
          });
  };

  // Success notice (alert-success).
  function success_prompt(message, time) {
      prompt(message, 'alert-success', time);
  }

  // Failure notice (alert-danger).
  function fail_prompt(message, time) {
      prompt(message, 'alert-danger', time);
  }

  // Warning notice (alert-warning).
  function warning_prompt(message, time) {
      prompt(message, 'alert-warning', time);
  }

  // Informational notice (alert-info).
  function info_prompt(message, time) {
      prompt(message, 'alert-info', time);
  }

</script>


<!-- fancybox -->
<script src="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.js"></script>
<script>
  let LAZY_LOAD_IMAGE = "";
  // Wrap each matched image in an <a data-fancybox="gallery"> link whose
  // href points at the image source.
  $(".article-entry").find("fancybox").find("img").each(function () {
      var $img = $(this);
      var $link = $(document.createElement("a"));
      $link.attr("data-fancybox", "gallery");
      $link.attr("href", $img.attr("src"));
      /* When images are lazy-loaded, the real address usually lives in a
       * separate attribute such as data-original; in that case the link
       * uses that attribute instead of src.
       */
      if (LAZY_LOAD_IMAGE) {
          $link.attr("href", $img.attr("data-original"));
      }
      $img.wrap($link[0]);
  });
</script>





  <script>setLoadingBarProgress(100);</script>
</body>
</html>
